1 Introduction

This workbook performs the basic data exploration of the dataset.

dataexp_level_exclusion_threshold <- 100

dataexp_cat_level_count <- 40
dataexp_hist_bins_count <- 50

2 Load Data

First we load the dataset.

data(freMTPLfreq)
data(freMTPLsev)

policy_data_tbl <- freMTPLfreq
claim_data_tbl  <- freMTPLsev %>%
    group_by(PolicyID) %>%
    summarise(claim_count = n()
             ,claim_total = sum(ClaimAmount)
             )

rawdata_tbl <- policy_data_tbl %>%
    left_join(claim_data_tbl, by = 'PolicyID') %>%
    as_data_frame

glimpse(rawdata_tbl)
## Observations: 413,169
## Variables: 12
## $ PolicyID    <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19...
## $ ClaimNb     <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Exposure    <dbl> 0.09, 0.84, 0.52, 0.45, 0.15, 0.75, 0.81, 0.05, 0.76, 0.34, 0.10,...
## $ Power       <fctr> g, g, f, f, g, g, d, d, d, i, f, f, e, e, e, e, e, e, i, i, h, h...
## $ CarAge      <int> 0, 0, 2, 2, 0, 0, 1, 0, 9, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ DriverAge   <int> 46, 46, 38, 38, 41, 41, 27, 27, 23, 44, 32, 32, 33, 33, 33, 54, 6...
## $ Brand       <fctr> Japanese (except Nissan) or Korean, Japanese (except Nissan) or ...
## $ Gas         <fctr> Diesel, Diesel, Regular, Regular, Diesel, Diesel, Regular, Regul...
## $ Region      <fctr> R72, R72, R31, R31, R52, R52, R72, R72, R31, R11, R24, R24, R11,...
## $ Density     <int> 76, 76, 3003, 3003, 60, 60, 695, 695, 7887, 27000, 23, 23, 1746, ...
## $ claim_count <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ claim_total <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...

2.1 Perform Quick Data Cleaning

### _TEMPLATE_
### Do simple datatype transforms and save output in data_tbl
data_tbl <- rawdata_tbl

names(data_tbl) <- data_tbl %>% names %>% clean_names

glimpse(data_tbl)
## Observations: 413,169
## Variables: 12
## $ policy_id   <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19...
## $ claim_nb    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ exposure    <dbl> 0.09, 0.84, 0.52, 0.45, 0.15, 0.75, 0.81, 0.05, 0.76, 0.34, 0.10,...
## $ power       <fctr> g, g, f, f, g, g, d, d, d, i, f, f, e, e, e, e, e, e, i, i, h, h...
## $ car_age     <int> 0, 0, 2, 2, 0, 0, 1, 0, 9, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ driver_age  <int> 46, 46, 38, 38, 41, 41, 27, 27, 23, 44, 32, 32, 33, 33, 33, 54, 6...
## $ brand       <fctr> Japanese (except Nissan) or Korean, Japanese (except Nissan) or ...
## $ gas         <fctr> Diesel, Diesel, Regular, Regular, Diesel, Diesel, Regular, Regul...
## $ region      <fctr> R72, R72, R31, R31, R52, R52, R72, R72, R31, R11, R24, R24, R11,...
## $ density     <int> 76, 76, 3003, 3003, 60, 60, 695, 695, 7887, 27000, 23, 23, 1746, ...
## $ claim_count <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ claim_total <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...

We want to replace missing values on the claim data with zeros, and also need to check consistency between the claim counts in both the policy and claim data sets. In the event of a discrepancy, we will go with the claim count from the claim data.

data_tbl <- data_tbl %>%
    mutate(claim_count = ifelse(is.na(claim_count), 0, claim_count)
          ,claim_total = ifelse(is.na(claim_total), 0, claim_total)
           )

data_tbl %>% filter(claim_count != claim_nb)
## # A tibble: 0 x 12
## # ... with 12 variables: policy_id <int>, claim_nb <int>, exposure <dbl>, power <fctr>,
## #   car_age <int>, driver_age <int>, brand <fctr>, gas <fctr>, region <fctr>,
## #   density <int>, claim_count <dbl>, claim_total <dbl>
data_tbl <- data_tbl %>%
    select(-claim_nb)

data_tbl %>% glimpse
## Observations: 413,169
## Variables: 11
## $ policy_id   <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19...
## $ exposure    <dbl> 0.09, 0.84, 0.52, 0.45, 0.15, 0.75, 0.81, 0.05, 0.76, 0.34, 0.10,...
## $ power       <fctr> g, g, f, f, g, g, d, d, d, i, f, f, e, e, e, e, e, e, i, i, h, h...
## $ car_age     <int> 0, 0, 2, 2, 0, 0, 1, 0, 9, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ driver_age  <int> 46, 46, 38, 38, 41, 41, 27, 27, 23, 44, 32, 32, 33, 33, 33, 54, 6...
## $ brand       <fctr> Japanese (except Nissan) or Korean, Japanese (except Nissan) or ...
## $ gas         <fctr> Diesel, Diesel, Regular, Regular, Diesel, Diesel, Regular, Regul...
## $ region      <fctr> R72, R72, R31, R31, R52, R52, R72, R72, R31, R11, R24, R24, R11,...
## $ density     <int> 76, 76, 3003, 3003, 60, 60, 695, 695, 7887, 27000, 23, 23, 1746, ...
## $ claim_count <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ claim_total <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...

2.2 Create Derived Variables

We now create derived features useful for modelling. These values are new variables calculated from existing variables in the data.

data_tbl <- data_tbl %>%
    mutate(policy_id = as.character(policy_id))

glimpse(data_tbl)
## Observations: 413,169
## Variables: 11
## $ policy_id   <chr> "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "1...
## $ exposure    <dbl> 0.09, 0.84, 0.52, 0.45, 0.15, 0.75, 0.81, 0.05, 0.76, 0.34, 0.10,...
## $ power       <fctr> g, g, f, f, g, g, d, d, d, i, f, f, e, e, e, e, e, e, i, i, h, h...
## $ car_age     <int> 0, 0, 2, 2, 0, 0, 1, 0, 9, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ driver_age  <int> 46, 46, 38, 38, 41, 41, 27, 27, 23, 44, 32, 32, 33, 33, 33, 54, 6...
## $ brand       <fctr> Japanese (except Nissan) or Korean, Japanese (except Nissan) or ...
## $ gas         <fctr> Diesel, Diesel, Regular, Regular, Diesel, Diesel, Regular, Regul...
## $ region      <fctr> R72, R72, R31, R31, R52, R52, R72, R72, R31, R11, R24, R24, R11,...
## $ density     <int> 76, 76, 3003, 3003, 60, 60, 695, 695, 7887, 27000, 23, 23, 1746, ...
## $ claim_count <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ claim_total <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...

2.3 Check Missing Values

Before we do anything with the data, we first check for missing values in the dataset. In some cases, missing data is coded by a special character rather than as a blank, so we first correct for this.

### _TEMPLATE_
### ADD CODE TO CORRECT FOR DATA ENCODING HERE

With missing data properly encoded, we now visualise the missing data in a number of different ways.

2.3.1 Univariate Missing Data

We first examine a simple univariate count of all the missing data:

row_count <- data_tbl %>% nrow

missing_univariate_tbl <- data_tbl %>%
    summarise_all(funs(sum(is.na(.)))) %>%
    gather('variable','missing_count') %>%
    mutate(missing_prop = missing_count / row_count)

ggplot(missing_univariate_tbl) +
    geom_bar(aes(x = fct_reorder(variable, -missing_prop), weight = missing_prop)) +
    scale_y_continuous(labels = comma) +
    xlab("Variable") +
    ylab("Missing Value Proportion") +
    theme(axis.text.x = element_text(angle = 90))

We remove all variables where all of the entries are missing

remove_vars <- missing_univariate_tbl %>%
    filter(missing_count == row_count) %>%
    .[['variable']]

lessmiss_data_tbl <- data_tbl %>%
    select(-one_of(remove_vars))

With these columns removed, we repeat the exercise.

missing_univariate_tbl <- lessmiss_data_tbl %>%
    summarise_all(funs(sum(is.na(.)))) %>%
    gather('variable','missing_count') %>%
    mutate(missing_prop = missing_count / row_count)

ggplot(missing_univariate_tbl) +
    geom_bar(aes(x = fct_reorder(variable, -missing_prop), weight = missing_prop)) +
    scale_y_continuous(labels = comma) +
    xlab("Variable") +
    ylab("Missing Value Proportion") +
    theme(axis.text.x = element_text(angle = 90))

To reduce the scale of this plot, we look at the top twenty missing data counts.

missing_univariate_top_tbl <- missing_univariate_tbl %>%
    arrange(desc(missing_count)) %>%
    top_n(n = 50, wt = missing_count)

ggplot(missing_univariate_top_tbl) +
    geom_bar(aes(x = fct_reorder(variable, -missing_prop), weight = missing_prop)) +
    scale_y_continuous(labels = comma) +
    xlab("Variable") +
    ylab("Missing Value Proportion") +
    theme(axis.text.x = element_text(angle = 90))

2.3.2 Multivariate Missing Data

It is useful to get an idea of what combinations of variables tend to have variables with missing values simultaneously, so to construct a visualisation for this we create a count of all the times given combinations of variables have missing values, producing a heat map for these combination counts.

missing_plot_tbl <- rawdata_tbl %>%
    mutate_all(funs(is.na)) %>%
    mutate_all(funs(as.numeric)) %>%
    mutate(label = do.call(paste0, (.))) %>%
    group_by(label) %>%
    summarise_all(funs(sum)) %>%
    arrange(desc(label)) %>%
    select(-label) %>%
    mutate(rowid = do.call(pmax, (.))) %>%
    gather('col','count', -rowid) %>%
    mutate(Proportion = count / row_count
          ,rowid      = round(rowid / row_count, 4)
    )

ggplot(missing_plot_tbl) +
    geom_tile(aes(x = col, y = as.factor(rowid), fill = Proportion), height = 0.8) +
    scale_fill_continuous(labels = comma) +
    scale_x_discrete(position = 'top') +
    xlab("Variable") +
    ylab("Missing Value Proportion") +
    theme(axis.text.x = element_text(angle = 90))

This visualisation takes a little explaining.

Each row represents a combination of variables with simultaneous missing values. For each row in the graphic, the coloured entries show which particular variables are missing in that combination. The proportion of rows with that combination is displayed in both the label for the row and the colouring for the cells in the row.

2.4 Inspect High-level-count Categorical Variables

With the raw data loaded up we now remove obvious unique or near-unique variables that are not amenable to basic exploration and plotting.

coltype_lst <- create_coltype_list(data_tbl)

catvar_valuecount_tbl <- data_tbl %>%
    summarise_at(coltype_lst$split$discrete
                ,function(x) length(unique(x))) %>%
    gather('var_name', 'level_count') %>%
    arrange(-level_count)

print(catvar_valuecount_tbl)
## # A tibble: 5 x 2
##    var_name level_count
##       <chr>       <int>
## 1 policy_id      413169
## 2     power          12
## 3    region          10
## 4     brand           7
## 5       gas           2
row_count <- nrow(data_tbl)

cat(paste0("Dataset has ", row_count, " rows\n"))
## Dataset has 413169 rows

Now that we a table of the counts of all the categorical variables we can automatically exclude unique variables from the exploration, as the level count will match the row count.

unique_vars <- catvar_valuecount_tbl %>%
    filter(level_count == row_count) %>%
    .[["var_name"]]

print(unique_vars)
## [1] "policy_id"
explore_data_tbl <- data_tbl %>%
    select(-one_of(unique_vars))

Having removed the unique identifier variables from the dataset, we may also wish to exclude categoricals with high level counts also, so we create a vector of those variable names.

highcount_vars <- catvar_valuecount_tbl %>%
    filter(level_count >= dataexp_level_exclusion_threshold
          ,level_count < row_count) %>%
    .[["var_name"]]

cat(paste0(highcount_vars, collapse = ', '))

We now can continue doing some basic exploration of the data. We may also choose to remove some extra columns from the dataset.

### You may want to comment out these next few lines to customise which
### categoricals are kept in the exploration.
drop_vars <- c(highcount_vars)

if(length(drop_vars) > 0) {
    explore_data_tbl <- explore_data_tbl %>%
        select(-one_of(drop_vars))

    cat(paste0(drop_vars, collapse = ', '))
}

3 Univariate Data Exploration

Now that we have loaded the data we can prepare it for some basic data exploration. We first exclude the variables that are unique identifiers or similar, and tehen split the remaining variables out into various categories to help with the systematic data exploration.

coltype_lst <- create_coltype_list(explore_data_tbl)

print(coltype_lst)
## $split
## $split$continuous
## [1] "exposure"    "car_age"     "driver_age"  "density"     "claim_count" "claim_total"
## 
## $split$discrete
## [1] "power"  "brand"  "gas"    "region"
## 
## 
## $columns
##     exposure        power      car_age   driver_age        brand          gas 
## "continuous"   "discrete" "continuous" "continuous"   "discrete"   "discrete" 
##       region      density  claim_count  claim_total 
##   "discrete" "continuous" "continuous" "continuous"

3.1 Logical Variables

Logical variables only take two values: TRUE or FALSE. It is useful to see missing data as well though, so we also plot the count of those.

logical_vars <- coltype_lst$split$logical

for(plot_varname in logical_vars) {
    cat("--\n")
    cat(paste0(plot_varname, '\n'))

    na_count <- explore_data_tbl %>% .[[plot_varname]] %>% is.na %>% sum

    explore_plot <- ggplot(explore_data_tbl) +
        geom_bar(aes_(x = plot_varname)) +
        xlab(plot_varname) +
        ylab("Count") +
        scale_y_continuous(labels = comma) +
        ggtitle(paste0('Barplot of Counts for Variable: ', plot_varname
                      ,' (', na_count, ' missing values)')) +
        theme(axis.text.x = element_text(angle = 90, vjust = 0.5))

    plot(explore_plot)
}

3.2 Numeric Variables

Numeric variables are usually continuous in nature, though we also have integer data.

numeric_vars <- coltype_lst$split$continuous

for(plot_varname in numeric_vars) {
    cat("--\n")
    cat(paste0(plot_varname, '\n'))

    plot_var <- explore_data_tbl %>% .[[plot_varname]]
    na_count <- plot_var %>% is.na %>% sum

    plot_var %>% summary %>% print

    explore_plot <- ggplot(explore_data_tbl) +
        geom_histogram(aes_string(x = plot_varname), bins = dataexp_hist_bins_count) +
        geom_vline(xintercept = mean  (plot_var, na.rm = TRUE), colour = 'red',   size = 1.5) +
        geom_vline(xintercept = median(plot_var, na.rm = TRUE), colour = 'green', size = 1.5) +
        xlab(plot_varname) +
        ylab("Count") +
        scale_x_continuous(labels = comma) +
        scale_y_continuous(labels = comma) +
        ggtitle(paste0('Histogram Plot for Variable: ', plot_varname
                      ,' (', na_count, ' missing values)')
               ,subtitle = '(red line is mean, green line is median)')

    print(explore_plot)
}
## --
## exposure
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00273 0.20000 0.54000 0.56109 1.00000 1.99000

## --
## car_age
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    3.00    7.00    7.53   12.00  100.00

## --
## driver_age
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    18.0    34.0    44.0    45.3    54.0    99.0

## --
## density
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       2      67     287    1985    1410   27000

## --
## claim_count
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.0392  0.0000  4.0000

## --
## claim_total
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0      83       0 2036833

3.3 Categorical Variables

Categorical variables only have values from a limited, and usually fixed, number of possible values

categorical_vars <- coltype_lst$split$discrete

for(plot_varname in categorical_vars) {
    cat("--\n")
    cat(paste0(plot_varname, '\n'))

    na_count <- explore_data_tbl %>% .[[plot_varname]] %>% is.na %>% sum

    plot_tbl <- explore_data_tbl %>%
        .[[plot_varname]] %>%
        as.character %>%
        fct_lump(n = dataexp_cat_level_count) %>%
        fct_count

    explore_plot <- ggplot(plot_tbl) +
        geom_bar(aes(x = fct_reorder(f, -n), weight = n)) +
        xlab(plot_varname) +
        ylab("Count") +
        scale_y_continuous(labels = comma) +
        ggtitle(paste0('Barplot of Counts for Variable: ', plot_varname
                      ,' (', na_count, ' missing values)')) +
        theme(axis.text.x = element_text(angle = 90, vjust = 0.5))

    plot(explore_plot)
}
## --
## power

## --
## brand

## --
## gas

## --
## region

3.4 Date/Time Variables

Date/Time variables represent calendar or time-based data should as time of the day, a date, or a timestamp.

datetime_vars <- coltype_lst$split$datetime

for(plot_varname in datetime_vars) {
    cat("--\n")
    cat(paste0(plot_varname, '\n'))

    plot_var <- explore_data_tbl %>% .[[plot_varname]]
    na_count <- plot_var %>% is.na %>% sum

    plot_var %>% summary %>% print

    explore_plot <- ggplot(explore_data_tbl) +
        geom_histogram(aes_string(x = plot_varname), bins = dataexp_hist_bins_count) +
        xlab(plot_varname) +
        ylab("Count") +
        scale_y_continuous(labels = comma) +
        ggtitle(paste0('Barplot of Dates/Times in Variable: ', plot_varname
                      ,' (', na_count, ' missing values)'))

    plot(explore_plot)
}

4 Bivariate Data Exploration

We now move on to looking at bivariate plots of the data set.

4.1 Pairs Plots

Pairs plots area very useful way of getting a quick idea of the relationships between variables in a data set.

Unfortunately, they do not scale well. Too many rows (say more than 5,000) can slow down the rendering, and more than 10 columns can make the plots uninterpretable as each cell is too small.

The technique is useful, so to circumvent these issues we sample the dataset. We select random columns and rows, and make a pairs plot of the subset, repeating this process for a number of iterations.

dataexp_pairsplot_itercount <-     3
dataexp_pairsplot_colcount  <-     5
dataexp_pairsplot_rowcount  <-  5000

if(ncol(data_tbl) > dataexp_pairsplot_colcount ||
   nrow(data_tbl) > dataexp_pairsplot_rowcount) {

    ### Ugly hack to work around current dplyr bug for mutate_if
    if(any(sapply(explore_data_tbl, is.logical))) {
        conv_tbl <- explore_data_tbl %>%
            mutate_if(is.logical, as.factor)
    } else {
        conv_tbl <- explore_data_tbl
    }

    conv_tbl <- conv_tbl %>%
        mutate_if(function(x) (is.character(x) || is.factor(x)) && !all(is.na(x))
                 ,function(x) fct_lump(x, n = 9))

    for(i in 1:dataexp_pairsplot_itercount) {
        cat("--\n")
        cat(paste0("Pairs plot iter: ", i, "\n"))

        pairs_tbl <- conv_tbl %>%
            create_ggpairs_tbl(sample_cols = dataexp_pairsplot_colcount
                              ,sample_rows = dataexp_pairsplot_rowcount
                               )

        cat(paste0("Columns: ", paste0(names(pairs_tbl), collapse = ', '), "\n"))

        pairs_tbl %>%
            ggpairs(cardinality_threshold = NULL
                   ,lower = list(combo = wrap('facethist', bins = 25))
                   ) %>%
            print
    }
} else {
    ggpairs(data_tbl) %>% print
}
## --
## Pairs plot iter: 1
## Columns: car_age, driver_age, gas, claim_count, claim_total

## --
## Pairs plot iter: 2
## Columns: exposure, power, gas, density, claim_count

## --
## Pairs plot iter: 3
## Columns: power, driver_age, brand, region, density

4.2 Facet Plots on Variables

We want to look at how the variables split on the logical variables as this is a very natural way to observe the data.

### _TEMPLATE_
facet_varname <- 'gas'


facet_count_max <- 3
facet_formula   <- formula(paste0("~ as.factor(", facet_varname, ")"))

4.2.1 Logical Variables

For logical variables we facet on barplots of the levels, comparing TRUE, FALSE and missing data.

logical_vars <- logical_vars[!logical_vars %in% facet_varname]

for(plot_varname in logical_vars) {
    cat("--\n")
    cat(paste0(plot_varname, '\n'))

    filter_formula <- formula(paste0("~ !is.na(", plot_varname, ")"))

    plot_tbl <- data_tbl %>% filter_(filter_formula)

    facet_count <- plot_tbl %>%
        .[[facet_varname]] %>%
        unique %>%
        length %>%
        min(facet_count_max)

    explore_plot <- ggplot(plot_tbl) +
        geom_bar(aes_string(x = plot_varname)) +
        facet_wrap(facet_formula, scales = 'free') +
        xlab(plot_varname) +
        ylab("Count") +
        scale_y_continuous(labels = comma) +
        ggtitle(paste0(facet_varname, '-Faceted Barplots for Variable: ', plot_varname)) +
        theme(axis.text.x = element_text(angle = 90, vjust = 0.5))

    plot(explore_plot)
}

4.2.2 Numeric Variables

For numeric variables, we facet on histograms of the data.

for(plot_varname in numeric_vars) {
    cat("--\n")
    cat(paste0(plot_varname, '\n'))

    filter_formula <- formula(paste0("~ !is.na(", plot_varname, ")"))

    plot_tbl <- data_tbl %>% filter_(filter_formula)

    facet_count <- plot_tbl %>%
        .[[facet_varname]] %>%
        unique %>%
        length %>%
        min(facet_count_max)

    explore_plot <- ggplot(plot_tbl) +
        geom_histogram(aes_string(x = plot_varname), bins = dataexp_hist_bins_count) +
        facet_wrap(facet_formula, scales = 'free') +
        xlab(plot_varname) +
        ylab("Count") +
        scale_x_continuous(labels = comma) +
        scale_y_continuous(labels = comma) +
        ggtitle(paste0(facet_varname, '-Faceted Histogram for Variable: ', plot_varname)) +
        theme(axis.text.x = element_text(angle = 90, vjust = 0.5))

    print(explore_plot)
}
## --
## exposure

## --
## car_age

## --
## driver_age

## --
## density

## --
## claim_count

## --
## claim_total

4.2.3 Categorical Variables

We treat categorical variables like logical variables, faceting the barplots of the different levels of the data.

categorical_vars <- categorical_vars[!categorical_vars %in% facet_varname]

for(plot_varname in categorical_vars) {
    cat("--\n")
    cat(paste0(plot_varname, '\n'))

    filter_formula <- formula(paste0("~ !is.na(", plot_varname, ")"))

    plot_tbl <- data_tbl %>% filter_(filter_formula)

    facet_count <- plot_tbl %>%
        .[[facet_varname]] %>%
        unique %>%
        length %>%
        min(facet_count_max)

    explore_plot <- ggplot(plot_tbl) +
        geom_bar(aes_string(x = plot_varname)) +
        facet_wrap(facet_formula, scales = 'free') +
        xlab(plot_varname) +
        ylab("Count") +
        scale_y_continuous(labels = comma) +
        ggtitle(paste0(facet_varname, '-Faceted Histogram for Variable: ', plot_varname)) +
        theme(axis.text.x = element_text(angle = 90, vjust = 0.5))

    plot(explore_plot)
}
## --
## power

## --
## brand

## --
## region

4.2.4 Date/Time Variables

Like the univariate plots, we facet on histograms of the years in the dates.

for(plot_varname in datetime_vars) {
    cat("--\n")
    cat(paste0(plot_varname, '\n'))

    filter_formula <- formula(paste0("~ !is.na(", plot_varname, ")"))

    plot_tbl <- data_tbl %>%
        filter_(filter_formula) %>%
        mutate_(plot_vartmp = plot_varname) %>%
        mutate(plot_var = year(plot_vartmp))

    facet_count <- plot_tbl %>%
        .[[facet_varname]] %>%
        unique %>%
        length %>%
        min(facet_count_max)

    explore_plot <- ggplot(plot_tbl) +
        geom_bar(aes(x = plot_var)) +
        facet_wrap(facet_formula, scales = 'free') +
        xlab(plot_varname) +
        ylab("Count") +
        scale_y_continuous(labels = comma) +
        ggtitle(paste0(facet_varname, '-Faceted Histogram for Variable: ', plot_varname))

    plot(explore_plot)
}

5 Multivariate Visualisation

Having looked at the pairs plots we also look at multivariate plots of all the data. We do this using techniques known as ‘multidimensional scaling’ or MDS.

Many of these techniques do not scale well beyond a few thousand data points, so we repeat our sampling trick as before and create multiple plots from these samples.

numeric_vars <- create_coltype_list(explore_data_tbl)$split$continuous

5.1 Multidimensional Scaling

We start with classic multidimensional scaling, also called ‘principal coordinates analysis’, which is done in R via the function cmdscale.

mds_iter_count   <-    4
mds_sample_count <- 2000

row_ids <- data_tbl %>%
    select(one_of(numeric_vars)) %>%
    complete.cases


### _TEMPLATE_
### Choosing the first variable in the categorical list by default. You probably
### want to change that.
colour_var <- 'power'


input_tbl <- data_tbl %>%
    select(one_of(c(numeric_vars, colour_var))) %>%
    filter(row_ids)

construct_mds_plot <- function(mds_tbl) {
    num_mds_dist <- mds_tbl %>% select(one_of(numeric_vars)) %>% dist

    num_mds <- cmdscale(num_mds_dist, k = 2, eig = TRUE)

    mds_tbl <- mds_tbl %>%
        mutate(mds_d1 = num_mds$points[,1]
              ,mds_d2 = num_mds$points[,2])

    mds_plot <- ggplot(mds_tbl) +
        geom_point(aes_string(x = 'mds_d1', y = 'mds_d2', colour = colour_var)) +
        xlab("MDS Dim 1") +
        ylab("MDS Dim 2")

    return(mds_plot)
}


mds_lst <- create_sampled_output(input_tbl, construct_mds_plot, mds_sample_count, mds_iter_count)


for(i in 1:length(mds_lst)) {
    cat("--\n")
    cat(paste0("MDS plot iter: ", i, "\n"))

    mds_lst[[i]] %>% print
}
## --
## MDS plot iter: 1

## --
## MDS plot iter: 2

## --
## MDS plot iter: 3

## --
## MDS plot iter: 4

5.2 t-SNE Plots

One standard method for doing this is t-SNE, t-distributed Stochastic Neighbourhood Embedding. This algorithm is a type of dimensionality reduction - it constructs a lower-dimensional set of data from the original dataset by attempting the minimise the Kullback-Lieber divergence between the original and target datasets.

t-SNE requires unique datapoints, so to ensure we do not pass repeated rows at any point, we may add a small amount of noise to the numeric columns to ensure uniqueness - t-SNE is a probabilistic process so this should not affect our output very much.

As with previous methods, we take samples from larger datasets and plot outputs from multiple samples.

tsne_iter_count   <-     4
tsne_sample_count <- 10000

row_ids <- data_tbl %>%
    select(one_of(numeric_vars)) %>%
    complete.cases


### _TEMPLATE_
### Choosing the first variable in the categorical list by default. You probably
### want to change that.
colour_var <- 'power'


input_tbl <- data_tbl %>%
    select(one_of(c(numeric_vars, colour_var))) %>%
    jitter_numeric_variables %>%
    filter(row_ids)


construct_tsne_plot <- function(tsne_tbl) {
    data_tsne <- Rtsne(tsne_tbl %>% select(one_of(numeric_vars)), theta = 0.9)

    tsne_tbl$tsne_d1 <- data_tsne$Y[,1]
    tsne_tbl$tsne_d2 <- data_tsne$Y[,2]

    tsne_plot <- ggplot(tsne_tbl) +
        geom_point(aes_string(x = 'tsne_d1', y = 'tsne_d2', colour = colour_var)
                  ,size = 0.5) +
        xlab("t-SNE Dim 1") +
        ylab("t-SNE Dim 2")

    return(tsne_plot)
}


tsne_lst <- create_sampled_output(input_tbl, construct_tsne_plot, tsne_sample_count, tsne_iter_count)


for(i in 1:length(tsne_lst)) {
    cat("--\n")
    cat(paste0("t-SNE plot iter: ", i, "\n"))

    tsne_lst[[i]] %>% print
}
## --
## t-SNE plot iter: 1

## --
## t-SNE plot iter: 2

## --
## t-SNE plot iter: 3

## --
## t-SNE plot iter: 4

6 Outlier Identification

Another important part of data exploration is the identification of possible outliers, and we approach this in multiple ways.

In keeping with the methodical approach we start with a univariate perspective, looking at each numerical variable by itself and plotting the values in the variable both with and without the outliers.

6.1 Univariate Outlier Plots

for(plot_varname in numeric_vars) {
    cat("--\n")
    cat(paste0(plot_varname, '\n'))

    plot_var <- data_tbl %>% .[[plot_varname]]

    outlier_point <- identify_univariate_outliers(plot_var)

    no_outlier_vals <- plot_var[outlier_point]

    all_plot <- ggplot() +
        geom_histogram(aes(x = plot_var), bins = dataexp_hist_bins_count) +
        xlab(plot_varname) +
        ylab("Count") +
        scale_x_continuous(labels = comma) +
        scale_y_continuous(labels = comma) +
        ggtitle("All Data") +
        theme(axis.text.x = element_text(angle = 90, vjust = 0.5))

    no_outlier_plot <- ggplot() +
        geom_histogram(aes(x = no_outlier_vals), bins = dataexp_hist_bins_count) +
        xlab(plot_varname) +
        ylab("Count") +
        scale_x_continuous(labels = comma) +
        scale_y_continuous(labels = comma) +
        ggtitle("No Outliers") +
        theme(axis.text.x = element_text(angle = 90, vjust = 0.5))

    plot_grid(all_plot, no_outlier_plot, ncol = 2) %>% print
}
## --
## exposure

## --
## car_age

## --
## driver_age

## --
## density

## --
## claim_count

## --
## claim_total

We use the above plots to decide if we need to remove certain extreme values from the dataset.

# We place basic logic for identifying univariate outliers here
#data_filt_tbl <- data_tbl %>% mutate(uni_outlier = ifelse(val > 100))